In [1]:
import pandas as pd
from sklearn.exceptions import UndefinedMetricWarning
import warnings
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import plotly.express as px
# NOTE(review): warnings/UndefinedMetricWarning/matplotlib/numpy are imported
# but not used in the visible cells — confirm they are needed downstream.

# Raw 2021 DfT road-casualty statistics hosted on GitHub (accident,
# casualty, and vehicle tables share the same repository path).
_BASE_URL = "https://raw.githubusercontent.com/Sayalinale/RoadTraffic-Analysis/main/"

# Load each CSV into its own DataFrame. low_memory=False makes pandas read
# each file in one pass so every column gets a single inferred dtype.
df_accident_data = pd.read_csv(_BASE_URL + "dft-road-casualty-statistics-accident-2021.csv", low_memory=False)
df_casualty_data = pd.read_csv(_BASE_URL + "dft-road-casualty-statistics-casualty-2021.csv", low_memory=False)
df_vehicle_data = pd.read_csv(_BASE_URL + "dft-road-casualty-statistics-vehicle-2021.csv", low_memory=False)
In [2]:
# Merge the DataFrames on the 'accident_reference' column
# NOTE(review): joining the casualty table and then the vehicle table on
# 'accident_reference' ALONE is a many-to-many merge — each accident's rows
# expand to (casualties x vehicles), duplicating casualty and vehicle
# records. The DfT tables also carry 'vehicle_reference'; the vehicle join
# should probably use ['accident_reference', 'vehicle_reference'] instead.
# Left as-is because later cells select the 'vehicle_reference_x' column
# produced by this exact merge — TODO confirm intended row semantics.
merged_data = pd.merge(df_accident_data, df_casualty_data, on='accident_reference')
merged_data = pd.merge(merged_data, df_vehicle_data, on='accident_reference')

# Now, 'merged_data' contains the merged data from all three DataFrames based on 'accident_reference'
In [3]:
# Quick sanity check on the merge: column index first, then (rows, cols).
for summary in (merged_data.columns, merged_data.shape):
    print(summary)
Index(['accident_index_x', 'accident_year_x', 'accident_reference',
       'location_easting_osgr', 'location_northing_osgr', 'longitude',
       'latitude', 'police_force', 'accident_severity', 'number_of_vehicles',
       'number_of_casualties', 'date', 'day_of_week', 'time',
       'local_authority_district', 'local_authority_ons_district',
       'local_authority_highway', 'first_road_class', 'first_road_number',
       'road_type', 'speed_limit', 'junction_detail', 'junction_control',
       'second_road_class', 'second_road_number',
       'pedestrian_crossing_human_control',
       'pedestrian_crossing_physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions',
       'special_conditions_at_site', 'carriageway_hazards',
       'urban_or_rural_area', 'did_police_officer_attend_scene_of_accident',
       'trunk_road_flag', 'trunk_road_flag.1', 'accident_index_y',
       'accident_year_y', 'vehicle_reference_x', 'casualty_reference',
       'casualty_class', 'sex_of_casualty', 'age_of_casualty',
       'age_band_of_casualty', 'casualty_severity', 'pedestrian_location',
       'pedestrian_movement', 'car_passenger', 'bus_or_coach_passenger',
       'pedestrian_road_maintenance_worker', 'casualty_type',
       'casualty_home_area_type', 'casualty_imd_decile', 'lsoa_of_casualty',
       'accident_index', 'accident_year', 'vehicle_reference_y',
       'vehicle_type', 'towing_and_articulation', 'vehicle_manoeuvre',
       'vehicle_direction_from', 'vehicle_direction_to',
       'vehicle_location_restricted_lane',
       'vehicle_location_restricted_lane.1', 'skidding_and_overturning',
       'hit_object_in_carriageway', 'vehicle_leaving_carriageway',
       'hit_object_off_carriageway', 'first_point_of_impact',
       'vehicle_left_hand_drive', 'journey_purpose_of_driver', 'sex_of_driver',
       'age_of_driver', 'age_band_of_driver', 'engine_capacity_cc',
       'propulsion_code', 'age_of_vehicle', 'generic_make_model',
       'driver_imd_decile', 'driver_home_area_type', 'lsoa_of_driver',
       'Unnamed: 28'],
      dtype='object')
(246314, 82)
In [4]:
# Count missing values per column in one vectorized pass over the frame.
null_counts = merged_data.isnull().sum()

# Report every column's missing-value count (only the coordinate columns
# and the all-null trailing 'Unnamed' column are expected to be non-zero).
for column, count in null_counts.items():
    print(f"Column '{column}': {count} missing values")
Column 'accident_index_x': 0 missing values
Column 'accident_year_x': 0 missing values
Column 'accident_reference': 0 missing values
Column 'location_easting_osgr': 43 missing values
Column 'location_northing_osgr': 43 missing values
Column 'longitude': 43 missing values
Column 'latitude': 43 missing values
Column 'police_force': 0 missing values
Column 'accident_severity': 0 missing values
Column 'number_of_vehicles': 0 missing values
Column 'number_of_casualties': 0 missing values
Column 'date': 0 missing values
Column 'day_of_week': 0 missing values
Column 'time': 0 missing values
Column 'local_authority_district': 0 missing values
Column 'local_authority_ons_district': 0 missing values
Column 'local_authority_highway': 0 missing values
Column 'first_road_class': 0 missing values
Column 'first_road_number': 0 missing values
Column 'road_type': 0 missing values
Column 'speed_limit': 0 missing values
Column 'junction_detail': 0 missing values
Column 'junction_control': 0 missing values
Column 'second_road_class': 0 missing values
Column 'second_road_number': 0 missing values
Column 'pedestrian_crossing_human_control': 0 missing values
Column 'pedestrian_crossing_physical_facilities': 0 missing values
Column 'light_conditions': 0 missing values
Column 'weather_conditions': 0 missing values
Column 'road_surface_conditions': 0 missing values
Column 'special_conditions_at_site': 0 missing values
Column 'carriageway_hazards': 0 missing values
Column 'urban_or_rural_area': 0 missing values
Column 'did_police_officer_attend_scene_of_accident': 0 missing values
Column 'trunk_road_flag': 0 missing values
Column 'trunk_road_flag.1': 0 missing values
Column 'accident_index_y': 0 missing values
Column 'accident_year_y': 0 missing values
Column 'vehicle_reference_x': 0 missing values
Column 'casualty_reference': 0 missing values
Column 'casualty_class': 0 missing values
Column 'sex_of_casualty': 0 missing values
Column 'age_of_casualty': 0 missing values
Column 'age_band_of_casualty': 0 missing values
Column 'casualty_severity': 0 missing values
Column 'pedestrian_location': 0 missing values
Column 'pedestrian_movement': 0 missing values
Column 'car_passenger': 0 missing values
Column 'bus_or_coach_passenger': 0 missing values
Column 'pedestrian_road_maintenance_worker': 0 missing values
Column 'casualty_type': 0 missing values
Column 'casualty_home_area_type': 0 missing values
Column 'casualty_imd_decile': 0 missing values
Column 'lsoa_of_casualty': 0 missing values
Column 'accident_index': 0 missing values
Column 'accident_year': 0 missing values
Column 'vehicle_reference_y': 0 missing values
Column 'vehicle_type': 0 missing values
Column 'towing_and_articulation': 0 missing values
Column 'vehicle_manoeuvre': 0 missing values
Column 'vehicle_direction_from': 0 missing values
Column 'vehicle_direction_to': 0 missing values
Column 'vehicle_location_restricted_lane': 0 missing values
Column 'vehicle_location_restricted_lane.1': 0 missing values
Column 'skidding_and_overturning': 0 missing values
Column 'hit_object_in_carriageway': 0 missing values
Column 'vehicle_leaving_carriageway': 0 missing values
Column 'hit_object_off_carriageway': 0 missing values
Column 'first_point_of_impact': 0 missing values
Column 'vehicle_left_hand_drive': 0 missing values
Column 'journey_purpose_of_driver': 0 missing values
Column 'sex_of_driver': 0 missing values
Column 'age_of_driver': 0 missing values
Column 'age_band_of_driver': 0 missing values
Column 'engine_capacity_cc': 0 missing values
Column 'propulsion_code': 0 missing values
Column 'age_of_vehicle': 0 missing values
Column 'generic_make_model': 0 missing values
Column 'driver_imd_decile': 0 missing values
Column 'driver_home_area_type': 0 missing values
Column 'lsoa_of_driver': 0 missing values
Column 'Unnamed: 28': 246314 missing values
In [5]:
# Columns retained for the analysis: accident identifiers/location, road
# and environmental conditions, casualty attributes, and vehicle/driver
# attributes. Everything else (duplicate merge-suffix columns, the all-null
# 'Unnamed: 28' column, etc.) is dropped.
columns_to_keep = [
    'accident_reference',
    'location_easting_osgr',
    'location_northing_osgr',
    'longitude',
    'latitude',
    'police_force',
    'accident_severity',
    'number_of_vehicles',
    'date',
    'day_of_week',
    'time',
    'local_authority_district',
    'first_road_class',
    'first_road_number',
    'road_type',
    'speed_limit',
    'junction_detail',
    'junction_control',
    'second_road_class',
    'second_road_number',
    'pedestrian_crossing_human_control',
    'pedestrian_crossing_physical_facilities',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'special_conditions_at_site',
    'carriageway_hazards',
    'did_police_officer_attend_scene_of_accident',
    'trunk_road_flag',
    'trunk_road_flag.1',
    'vehicle_reference_x',
    'casualty_class',
    'sex_of_casualty',
    'age_of_casualty',
    'age_band_of_casualty',
    'casualty_severity',
    'pedestrian_location',
    'pedestrian_movement',
    'car_passenger',
    'bus_or_coach_passenger',
    'pedestrian_road_maintenance_worker',
    'casualty_type',
    'casualty_home_area_type',
    'casualty_imd_decile',
    'vehicle_type',
    'towing_and_articulation',
    'vehicle_manoeuvre',
    'vehicle_direction_from',
    'vehicle_direction_to',
    'vehicle_location_restricted_lane',
    'vehicle_location_restricted_lane.1',
    'skidding_and_overturning',
    'hit_object_in_carriageway',
    'vehicle_leaving_carriageway',
    'hit_object_off_carriageway',
    'first_point_of_impact',
    'vehicle_left_hand_drive',
    'journey_purpose_of_driver',
    'sex_of_driver',
    'age_of_driver',
    'age_band_of_driver',
    'engine_capacity_cc',
    'propulsion_code',
    'age_of_vehicle',
    'driver_imd_decile',
    'driver_home_area_type',
    'lsoa_of_driver'
]

# Restrict to the selected columns and drop rows with any missing value
# (only the ~43 rows lacking coordinates) in a single chained step.
merged_data = merged_data.loc[:, columns_to_keep].dropna()
In [6]:
# Shape after column pruning and dropna — expected (rows, 67 columns).
merged_data.shape
Out[6]:
(246271, 67)
In [7]:
# pandas and plotly.express were already imported in the first cell; the
# redundant mid-notebook re-imports have been removed (imports belong in
# one block at the top so the notebook's dependencies stay visible).

# Interactive density heatmap of accident locations, centered on the mean
# coordinate of the data.
fig = px.density_mapbox(
    merged_data,
    lat='latitude',
    lon='longitude',
    radius=8,  # pixel radius of influence for each accident point
    center=dict(lat=merged_data['latitude'].mean(), lon=merged_data['longitude'].mean()),
    zoom=6,  # country-level view
    mapbox_style='open-street-map',  # free tiles, no Mapbox token required
    color_continuous_scale='rainbow',
    title='Spatial Heatmap of Accidents',
)

fig.show()
In [8]:
# Cluster a 10% sample of the accidents by road type with DBSCAN and show
# the resulting labels on an interactive map.

# Work on a reproducible 10% sample to keep memory and runtime manageable.
downsampled_data = merged_data.sample(frac=0.1, random_state=42)

# Single clustering feature, standardized before DBSCAN.
# NOTE(review): 'road_type' is a categorical code, so scaling + DBSCAN here
# effectively groups rows with identical codes — confirm this is intended.
scaled_features = StandardScaler().fit_transform(downsampled_data[['road_type']])

# Fit DBSCAN and attach its labels (-1 = noise) to the sampled frame.
downsampled_data['cluster'] = DBSCAN(eps=0.5, min_samples=5).fit_predict(scaled_features)

# Scatter map colored by cluster label, centered on the sample's mean coordinate.
fig = px.scatter_mapbox(
    downsampled_data,
    lat='latitude',
    lon='longitude',
    color='cluster',
    center=dict(lat=downsampled_data['latitude'].mean(), lon=downsampled_data['longitude'].mean()),
    zoom=10,
    mapbox_style='open-street-map',
    title='DBSCAN Clusters on Map with Road Type',
)

fig.show()
In [10]:
# Cluster a 10% sample of the accidents by weather conditions with DBSCAN
# and show the resulting labels on an interactive map.

# Downsample the data to reduce memory usage (adjust the fraction as needed)
downsampled_data = merged_data.sample(frac=0.1, random_state=42)

# Selecting relevant features for clustering
features_for_clustering = downsampled_data[['weather_conditions']]

# Standardize the features
scaler = StandardScaler()
features_for_clustering_scaled = scaler.fit_transform(features_for_clustering)

# Apply DBSCAN (-1 labels mark noise points)
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(features_for_clustering_scaled)

# Add the clusters to the DataFrame
downsampled_data['cluster'] = clusters

# Plot the scatter map with clusters.
# BUGFIX: the title previously said "Road Type" (copy-pasted from the
# previous cell) although this cell clusters on weather_conditions.
fig = px.scatter_mapbox(downsampled_data, 
                        lat='latitude', 
                        lon='longitude', 
                        color='cluster',
                        center=dict(lat=downsampled_data['latitude'].mean(), lon=downsampled_data['longitude'].mean()),
                        zoom=10,
                        mapbox_style='open-street-map',
                        title='DBSCAN Clusters on Map with Weather Conditions')

fig.show()
In [11]:
# Cluster a 10% sample of the accidents by road surface condition with
# DBSCAN and show the resulting labels on an interactive map.

# Reproducible 10% sample (same seed as the other clustering cells).
downsampled_data = merged_data.sample(frac=0.1, random_state=42)

# Single clustering feature, standardized before DBSCAN.
# NOTE(review): 'road_surface_conditions' is a categorical code, so this
# effectively groups rows with identical codes — confirm this is intended.
scaled_features = StandardScaler().fit_transform(downsampled_data[['road_surface_conditions']])

# Fit DBSCAN (min_samples=3 here, unlike 5 in the previous cells) and
# attach its labels (-1 = noise) to the sampled frame.
downsampled_data['cluster'] = DBSCAN(eps=0.5, min_samples=3).fit_predict(scaled_features)

# Scatter map colored by cluster label at a country-level zoom.
fig = px.scatter_mapbox(
    downsampled_data,
    lat='latitude',
    lon='longitude',
    color='cluster',
    center=dict(lat=downsampled_data['latitude'].mean(), lon=downsampled_data['longitude'].mean()),
    zoom=6,
    mapbox_style='open-street-map',
    title='DBSCAN Clusters on Map with Road Surface Conditions',
)

fig.show()
In [ ]: